---
title: "Figures in Main Text"
output: github_document
editor_options: 
  chunk_output_type: console
always_allow_html: true
---

```{r setup, include = FALSE}
# code chunk settings
knitr::opts_chunk$set(echo = TRUE,
                      warning = FALSE,
                      message = FALSE,
                      fig.width = 12,
                      fig.align = 'center')

usethis::use_blank_slate()
# Suppress summarise info
options(dplyr.summarise.inform = FALSE)

### ----------- packages
pacman::p_load(tidyverse,
               data.table,
               sjlabelled,
               sjmisc,
               psych,
               lubridate,
               # estimation
               survey,
               sandwich,
               lmtest,
               # plotting / tables
               waffle,
               cowplot, 
               purrr,
               texreg,
               ggimage,
               ggpattern,
               kableExtra,
               stargazer)
```


```{r data}
# survey + extension data for individuals
merged_data <- read_rds("data/yg_browser_cces_merged.rds")

# activity data
activity_data <- read_rds("data/activity_yg_cces.rds") 

# subscriptions data (see build.R for construction of this table)
summarize_subscribe_table <-
  read_csv("data/summarize_subscribe_table_wtd.csv")

# referrers data (see build.R for construction of these tables)
on_platform_referrers_by_channel <- 
  read_csv('data/on_platform_referrers_by_channel_wtd.csv')
aggregated_referrers_by_channel <- 
  read_csv('data/aggregated_referrers_by_channel_wtd.csv')

# list of referrers (see appendix figs script for list of referrers)
referrers_list <- read_rds('data/referrers_list.rds')
external_referrers <- referrers_list$external_referrers
internal_referrers_other <- referrers_list$internal_referrers_other
internal_referrers <- referrers_list$internal_referrers

# recommendations data (see build.R for construction of this table)
recs_data <- 
  read_delim("data/recommendation_pipeline_wtd.tsv", delim = '\t')
```


```{r helper functions, include = TRUE}
source("helper_fxns.R") # these are mostly for plotting and presentation
```


```{r define variables}
# attach survey weights
svy_at <- svydesign(ids = ~ 1,
                    data = activity_data,
                    weights = ~ weight_cmd)

# color palette 
color_palette <- c("#FFA500", "#CD5C5C", "#015CB9", "#E3E6E6")

# all channel categoies
channel_types <- c("alternative",
                   "extremist",
                   "mainstream",
                   "other")

# dependent variables (time spent per week) by channel type
minutes_activity_time_all_week <- c(
    "minutes_activity_yt_video_time_elapsed_capped_total_alternative_all_week",
    "minutes_activity_yt_video_time_elapsed_capped_total_extremist_all_week",
    "minutes_activity_yt_video_time_elapsed_capped_total_mainstream_all_week",
    "minutes_activity_yt_video_time_elapsed_capped_total_other_all_week"
  )

# dummies for whether participant watch any of 
any_channel_type <- c("at_alt",
                      "at_ext",
                      "at_msm",
                      "at_other")

# whether participant watched any video that they subscribed to from X channel
any_subscribed_channel_type <- c(
  "at_any_alternative_subscribed",
  "at_any_extremist_subscribed",
  "at_any_mainstream_subscribed",
  "at_any_other_subscribed"
) 
```


```{r weighted summary functions}
weighted_mean_fxn <- function(x,
                              data) {
  svy_at <- svydesign(ids = ~ 1,
                      data = data,
                      weights = ~ weight_cmd)
  
  result <- svymean(~ get(x), design = svy_at, na.rm = T)
  
  median_result <-
    svyquantile(
      ~ get(x),
      quantiles = .5,
      design = svy_at,
      na.rm = T,
      ci = TRUE
    )[[1]][1]
  
  
  output <- tibble(
    channel_type = str_extract(x, paste0(channel_types, collapse = "|")),
    raw_mean = mean(pull(data, get(x)), na.rm = T),
    raw_se = sd(pull(data, get(x)), na.rm = T) / sqrt(nrow(na.omit(data[, x]))),
    weighted_mean = coef(result),
    weighted_median = median_result,
    weighted_se = SE(result)[1],
    lwr95 = weighted_mean - 1.96 * weighted_se,
    upr95 = weighted_mean + 1.96 * weighted_se
  ) %>%
    mutate(
      channel_type = case_when(
        channel_type == "alternative" ~ "Alternative",
        channel_type == "extremist" ~ "Extremist",
        channel_type == "mainstream" ~ "Mainstream",
        channel_type == "other" ~ "Other"
      )
    )
  
  return(output)
}

weighted_prop_fxn <- function(x, data) {
  
  weighted <- data %>%
    group_by(get(x)) %>%
    summarise(total = sum(weight_cmd)) %>%
    na.omit()
  
  result <- prop.table(weighted$total) * 100
  
  result_unweighted <- prop.table(table(data[x])) * 100
  
  output <- tibble(
    channel_type = str_extract(x, "alt|ext|msm|mainstream|other"),
    raw_prop = result_unweighted[2],
    raw_se = sqrt((raw_prop * (100 - raw_prop)) / nrow(na.omit(data[, x]))),
    weighted_prop = result[2],
    weighted_se =  sqrt((weighted_prop * (
      100 - weighted_prop
    )) / nrow(na.omit(data[, x]))),
    lwr95 = weighted_prop - 1.96 * weighted_se,
    upr95 = weighted_prop + 1.96 * weighted_se
  ) %>%
    mutate(
      channel_type = case_when(
        channel_type == "alt" ~ "Alternative",
        channel_type == "ext" ~ "Extremist",
        channel_type == "msm" | channel_type == "mainstream" ~ "Mainstream",
        channel_type == "other" ~ "Other"
      )
    )
  return(output)
}
```



### Exposure level estimates (page 11)

Just 15% of the sample for whom we have browser activity data (n=1,181) viewed any video from an alternative channel and only 6% viewed any video from an extremist channel. By comparison, 44% viewed at least one video from a mainstream media channel. 

```{r}
map_dfr(any_channel_type, ~ weighted_prop_fxn(.x, activity_data)) %>% 
  kbl(digits = 2, 
      format = "html") %>% 
  kable_styling()
```


Among the set of people who saw at least one extremist channel video, for instance, 52% saw at least one video from an extremist channel they subscribe to during the study period. Similarly, 39% of all people who saw at least one alternative channel video viewed at least one video from a channel to which they subscribed. 

```{r}
# dfs with those who viewed any alternative/extremist/mainstream channel
viewed_any_channel_type <- list(
  activity_data %>% filter(at_alt == 1),
  activity_data %>% filter(at_ext == 1),
  activity_data %>% filter(at_msm == 1),
  activity_data %>% filter(at_other == 1)
)

map2_dfr(.x = any_subscribed_channel_type,
         .y = viewed_any_channel_type,
         ~ weighted_prop_fxn(.x, .y)) %>% 
  kbl(digits = 2, 
      format = "html") %>% 
  kable_styling()

```



## Figure 1: Distribution of video views by subscription status and channel type

```{r}
summarize_subscribe_plot <- summarize_subscribe_table %>%
  mutate(subscribed_group = factor(
    subscribed_group,
    levels = c(
      "subscribed to current",
      "subscribed to another",
      "not subscribed"
    )
  ),
  channel_type = case_when(
    str_detect(channel_type, "Mainstream") ~ str_replace(channel_type, 'Mainstream media', 'Mainstream media\nchannel videos'),
    str_detect(channel_type, "Alternative") ~ str_replace(channel_type, 'Alternative channel', 'Alternative channel videos'),
    str_detect(channel_type, "Extremist") ~ str_replace(channel_type, 'Extremist channel', 'Extremist channel videos'),
    str_detect(channel_type, "Other") ~ str_replace(channel_type, 'Other channel', 'Other channel videos')
  )) %>%
  ggplot(aes(
    x = channel_type,
    y = percent,
    fill = channel_type,
    pattern = subscribed_group
  )) +
  geom_col_pattern(
    position = position_dodge2(.9),
    aes(color = channel_type),
    pattern_color = "#212426",
    pattern_fill = "#212426",
    pattern_angle = 45,
    pattern_density = 0.01,
    pattern_spacing = 0.04,
    pattern_key_scale_factor = 0.2
  ) +
  geom_text(aes(label = scales::comma(total_label), y = -5),
            position = position_dodge2(.95),
            size = 2.5) +
  geom_linerange(aes(ymin = ci_lwr95, ymax = ci_upr95),
                 position = position_dodge2(.9),
                 lwd = .75) +
  labs(x = "", y = "Percentage of views") +
  scale_y_continuous(
    limits = c(-10, 100),
    breaks = seq(0, 100, by = 20),
    labels = paste0(seq(0, 100, by = 20), "%")
  ) +
  scale_fill_manual(
    values = color_palette,
    name = "",
    labels = c(
      "Alternative channel",
      "Extremist channel",
      "Mainstream media channel",
      "Other channel"
    )
  ) +
  scale_color_manual(
    values = color_palette,
    name = "",
    labels = c(
      "Alternative channel",
      "Extremist channel",
      "Mainstream media channel",
      "Other channel"
    )
  ) +
  scale_pattern_manual(
    name = "",
    labels = c(
      "Subscribed to current",
      "Subscribed to another",
      "Not subscribed"
    ),
    values = c(
      "subscribed to current" = "crosshatch",
      "subscribed to another" = "stripe",
      "not subscribed" = "none"
    )
  ) +
  theme_minimal() +
  theme(
    strip.text = element_text(hjust = 0),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    legend.position = 'bottom'
  ) +
  guides(
    fill = "none",
    pattern = guide_legend(override.aes = list(fill = "white", color = "black")),
    color = "none"
  )
summarize_subscribe_plot
ggsave('./main-text-figures_files/subscriptions_by_other_subscriptions_wtd.png',
       dpi = 600, width = 11, height = 6)
ggsave('./main-text-figures_files/subscriptions_by_other_subscriptions_wtd.pdf',
       dpi = 600, width = 11, height = 6)
```


```{r fig 1 table, eval=FALSE, include=FALSE}
# this table was created upstream and output as table is read in directly
summarize_subscribe_table %>% 
  separate("channel_type", into = "channel_type", extra = "drop") %>% 
  select("Subscription status" = subscribed_group,
         "Channel type" = channel_type,
         "N" = n,
         "Percent" = percent,
         "Std. Error" = stderr) %>% 
  kbl(digits = 2, 
      booktabs = T,
      format = "latex") %>% 
  kable_styling() %>% 
  save_kable("./main-text-figures_files/subscriptions_by_other_subscriptions_wtd.tex",float = FALSE)
```

If we instead define subscribers to include all people who subscribe to at least one channel of the type in question, the proportion of views from subscribers increases to 88% (92.9% weighted) for alternative channels and 89% for extremist channels (84.7% weighted).

```{r}
summarize_subscribe_table %>%
  filter(subscribed_group!="not subscribed") %>%
  group_by(channel_type) %>%
  summarise(sum(percent))
```



### Exposure level estimates (page 12)

Among the participants who viewed at least one video from an alternative or extremist channel, the time spent watching them was relatively low: 26 minutes per week for alternative channel videos and 8 minutes for extremist channel videos. The comparison statistics are 12 minutes per week for mainstream media channel videos and 214 minutes per week for videos from other channels

```{r}
map2_dfr(.x = minutes_activity_time_all_week,
         .y = viewed_any_channel_type,
         ~ weighted_mean_fxn(.x, .y)) %>%
  kbl(format = "html",
      digits = 2) %>% 
  kable_styling()
```

(62 minutes per week for subscribers to one or more alternative channels [6%] versus 0.2 minutes per week for non-subscribers [9%]) and (15 minutes per week for subscribers [3%] versus 0.04 minute per week for non-subscribers [3%]).

```{r}
# dfs for those subsscribed to any X channel
subscribed_any_channel_type <- list(
  activity_data %>% filter(at_any_alternative_subscribed == 1),
  activity_data %>% filter(at_any_extremist_subscribed == 1),
  activity_data %>% filter(at_any_mainstream_subscribed == 1),
  activity_data %>% filter(at_any_other_subscribed == 1)
)

# dummy for NOT subsscribed to any X channel
notsubscribed_any_channel_type <- list(
  activity_data %>% filter(at_any_alternative_subscribed != 1),
  activity_data %>% filter(at_any_extremist_subscribed != 1),
  activity_data %>% filter(at_any_mainstream_subscribed != 1),
  activity_data %>% filter(at_any_other_subscribed != 1)
)

# calculate weighted mean for each channel type
subscriber_estimates <- map2_dfr(.x = minutes_activity_time_all_week,
                                 .y = subscribed_any_channel_type,
                                 ~ weighted_mean_fxn(.x, .y)) %>%
  mutate(subscription_status = paste0(channel_types,
                                      " sub"))

nonsubscriber_estimates <- map2_dfr(.x = minutes_activity_time_all_week,
                                    .y = notsubscribed_any_channel_type,
                                    ~ weighted_mean_fxn(.x, .y))  %>%
  mutate(subscription_status = paste0(channel_types,
                                      " non-sub"))

bind_rows(subscriber_estimates,
          nonsubscriber_estimates) %>% 
  kbl(format = "html",
      digits = 3) %>% 
  kable_styling()

```



## Figure 2. Concentration of exposure to alternative and extremist channels

```{r fig.height = 8}
#cumulative time spent on that channel and the cumulative user 
cumsum_fxn <- function (var, data) {
  channel_type <-
      str_replace(str_extract(var, "[a-z]+_all_week$"), "_all_week", "")
  data %>%
    arrange(desc(.data[[var]])) %>%
    mutate(
      users = weight_cmd / sum(.$weight_cmd, na.rm = T),
      views = (.data[[var]] / sum(.data[[var]], na.rm = T)),
      cum_user = cumsum(users),
      cum_views = cumsum(views),
      ln_cum_user = log10(cumsum(users)),
      ln_cum_views = log10(cumsum(views)),
      source = channel_type
    ) %>%
    select(cum_user, cum_views, ln_cum_user, source, caseid, weight_cmd)
}

# calculate for all channel types
concentration_time_user <- map_dfr(minutes_activity_time_all_week, 
                            ~cumsum_fxn(.x, activity_data))

# get the %user for 80% watch time
lab_stat <-
  cumsum_fxn('minutes_activity_yt_video_time_elapsed_capped_total_alternative_all_week',
             activity_data) %>%
  filter(round(cum_views, 2) == .8) %>%
  pull(ln_cum_user)

# log 10 on x-axis
time_cumsum_plot_inset <- concentration_time_user %>%
  ggplot(aes(x = ln_cum_user, y = cum_views, color = source)) +
  geom_line(size = 2) +
  geom_point(
    aes(x = lab_stat, y = .8),
    color = 'black',
    shape = 4,
    stroke = 1,
    size = 1.5
  ) +
  scale_y_continuous(labels = scales::percent) +
  scale_color_manual(
    name = "",
    labels = c(
      "Alternative\nchannel videos",
      "Extremist\nchannel videos",
      "Mainstream media\nchannel videos",
      "Other\nchannel videos"
    ),
    values = color_palette
  ) +
  geom_segment(aes(
    x = -4,
    xend = lab_stat,
    y = .8,
    yend = .8
  ),
  lty = 2,
  color = 'black') +
  geom_segment(aes(
    x = lab_stat,
    xend = lab_stat,
    y = 0,
    yend = .8
  ),
  lty = 2,
  color = 'black') +
  geom_curve(
    aes(
      x = -2.5,
      xend = lab_stat,
      y = .9,
      yend = .8
    ),
    curvature = 0.25,
    angle = 35,
    color = 'black'
  ) +
  scale_x_continuous(
    limits = c(-4, 0),
    breaks = seq(-4, 0,
                 by = 1),
    labels = paste0((round(10 ^ (
      seq(-4, 0,
          by = 1)
    ), 3)) * 100, "%")
  ) +
  annotate(
    "text",
    x = -3,
    y = .95,
    label = str_wrap(paste0(
      round(10^(lab_stat) * 100, 1),
      "% of users account for 80% of time \nspent viewing alternative channel videos."
    ), width = 45),
    size = 2.5
  ) +
  labs(x = expression(log[10]~ "(Percentage of users)"), 
       y = "Percentage of total exposure (minutes)") +
  theme_minimal() +
  theme(
    plot.background = element_blank(),
    legend.position = 'bottom',
    axis.title.x = element_text(size = 10),
    axis.title.y = element_text(size = 10)
  )

# without logging x
time_cumsum_plot_zoomout <- concentration_time_user %>%
  ggplot(aes(x = cum_user, y = cum_views, color = source)) +
  geom_line(size = 2) +
  scale_y_continuous(labels = scales::percent) +
  scale_x_continuous(labels = scales::percent) +
  scale_color_manual(
    name = "",
    labels = c(
      "Alternative\nchannel videos",
      "Extremist\nchannel videos",
      "Mainstream media\nchannel videos",
      "Other\nchannel videos"
    ),
    values = color_palette
  ) +
  annotate(geom = "rect", col = "black", fill = "white",
           lwd = 2,
           ymin = 0, ymax = .87, xmin = .28, xmax = 1) +
  labs(x = "Percentage of users", y = "Percentage of total exposure (minutes)") +
  theme_minimal() +
  theme(
    plot.background = element_blank(),
    legend.position = 'bottom',
    axis.title.x = element_text(size = 12),
    axis.title.y = element_text(size = 12)
  )

# combine inset (logged) and non-log 
time_cumsum_plot_zoomout +
  patchwork::inset_element(time_cumsum_plot_inset +
                             guides(color = "none"), 
                           left = 0.3, bottom = 0.05, right = .95, top = .85,
                           on_top = TRUE)

ggsave('./main-text-figures_files/cdf_users_time_exposure.png',
      dpi = 600, width = 8, height = 6)

ggsave('./main-text-figures_files/cdf_users_time_exposure.pdf',
      dpi = 600, width = 8, height = 6)

```

### Exposure level estimates (page 13)

```{r}
# join with rest of survey data
activity_data_supers <- activity_data %>%
  left_join(
    concentration_time_user %>%
      select(caseid, cum_views,  source) %>%
      pivot_wider(
        names_from = "source",
        names_glue = "cumsum_{source}",
        values_from = "cum_views"
      ),
    by = "caseid"
  ) %>%
  # define a superconsumer as someone who is in the top 80th percentile of watch time per week
  mutate(
    super_alternative = if_else(cumsum_alternative <= .8, 1, 0),
    super_extremist = if_else(cumsum_extremist <= .8, 1, 0),
    super_mainstream = if_else(cumsum_mainstream <= .8, 1, 0)
  )

# select the top alternative super consumers
top_time_all_weeks_most_alt <- activity_data_supers  %>%
  filter(super_alternative == 1)  %>%
  arrange(desc(
    minutes_activity_yt_video_time_elapsed_capped_total_alternative_all_week
  )) %>%
  mutate(rank = 1:nrow(.)) %>%
  pivot_longer(cols = all_of(minutes_activity_time_all_week)) %>%
  mutate(
    channel_type = str_replace(name, "_all_week$", ""),
    channel_type = str_extract(channel_type, "[a-z]+$"),
    minutes_value = value,
    image = if_else(
      super_extremist == 1 ,
      "https://i.postimg.cc/bwL3hjPY/user-icon-extremist.png",
      NA_character_
    )
  )

# select the top extremist super consumers
top_time_all_weeks_most_ext <- activity_data_supers  %>%
  filter(super_extremist == 1)  %>%
  arrange(desc(
    minutes_activity_yt_video_time_elapsed_capped_total_extremist_all_week
  )) %>%
  mutate(rank = 1:nrow(.)) %>%
  pivot_longer(cols = all_of(minutes_activity_time_all_week)) %>%
  mutate(
    channel_type = str_replace(name, "_all_week$", ""),
    channel_type = str_extract(channel_type, "[a-z]+$"),
    minutes_value = value,
    image = if_else(
      super_alternative == 1 ,
      "https://i.postimg.cc/D0BPKrVj/user-icon-alternative.png",
      NA_character_
    )
  )
```


1.6% of participants (17 people unweighted, 18.4 weighted) account for 79% of total time spent on videos from alternative channels.

```{r}
con_super_alt <- concentration_time_user %>% 
  filter(source == "alternative" & cum_views <= .8)

con_super_alt %>% 
  pull(cum_user) %>% 
  max()

# unweighted n
con_super_alt %>% 
  nrow()

# weighted n
con_super_alt %>% 
  summarise(sum(weight_cmd))
```

This imbalance is even more severe for extremist channels, where 0.6% of participants (9 people unweighted, 7.65 weighted) were responsible for 80% of total time spent on these videos.

```{r}
con_super_ext <- concentration_time_user %>% 
  filter(source == "extremist" & cum_views <= .8) 

con_super_ext %>% 
  pull(cum_user) %>% 
  max()

con_super_ext %>% 
  nrow()

con_super_ext %>% 
  summarise(sum(weight_cmd))
```


### Exposure level estimates (Superconsumers in appendix)

Alternative channel superconsumers spend a weighted median of 29 hours (1715 minutes) each week watching YouTube.

```{r}
alternative_superconsumers <- activity_data_supers %>% 
  filter(super_alternative == 1) 

weighted_mean_fxn("minutes_at_yt_video_time_elapsed_capped_total_week",
                  data = alternative_superconsumers) %>% 
  pull(weighted_median) 
```


Extremist channel superconsumers spend a median of 16 hours (979 minutes) each week watching YouTube.

```{r}
extemist_superconsumers <- activity_data_supers %>% 
  filter(super_extremist == 1) 

weighted_mean_fxn("minutes_at_yt_video_time_elapsed_capped_total_week",
                  data = extemist_superconsumers)  %>% 
  pull(weighted_median) 
```

Median time per week across all participants is 0.2 hours (14 minutes).

```{r}
weighted_mean_fxn("minutes_at_yt_video_time_elapsed_capped_total_week",
                  data = activity_data_supers)  %>% 
  pull(weighted_median) 
```

Number of alternative superconsumers is 17 (weighted 18.4).

```{r}
activity_data_supers %>% 
  filter(super_alternative == 1) %>% 
  nrow()
```

Number of extremist superconsumers is 9 (weighted 7.65).

```{r}
activity_data_supers %>% 
  filter(super_extremist == 1) %>% 
  nrow()
```


## Figure 3. Predictors of watch time

```{r formulas}
# DV is time elapsed on channel video
f_time_alternative_all <- formula(minutes_activity_yt_video_time_elapsed_capped_total_alternative_all ~  rr_cts + jw_cts + fem_cts + age + gender + educ2 + race)
f_time_extremist_all <- formula(minutes_activity_yt_video_time_elapsed_capped_total_extremist_all ~  rr_cts + jw_cts + fem_cts + age + gender + educ2 + race )
f_time_mainstream_all <- formula(minutes_activity_yt_video_time_elapsed_capped_total_mainstream_all ~  rr_cts + jw_cts + fem_cts + age + gender + educ2 + race)

time_fs <- list(
  f_time_alternative_all,
  f_time_extremist_all,
  f_time_mainstream_all
)
```


```{r estimate quasipoisson}
# function to calculate rocust quasipoisson 
robust_weighted_quasipoisson <- function(data = activity_data,
                                         formula,
                                         robust_output = TRUE) {
  fit <- glm(
    formula,
    family = quasipoisson(link = "log"),
    offset = log(week),
    data = data,
    weights = weight_cmd
  )
  
  results <- fit
  
  if (robust_output == TRUE) {
    cov_mod <- vcovHC(fit, type = "HC0")
    std_err <- sqrt(diag(cov_mod))
    q_val <- qnorm(0.975) # stick to 95% CI
    
    results <- bind_cols(
      predictor = fit$coefficients %>% names(),
      estimate = coef(fit),
      robust_se = std_err,
      z = (coef(fit) / std_err),
      p_val = 1.96 * pnorm(abs(coef(fit) / std_err), lower.tail = FALSE),
      ci_lwr = coef(fit) - q_val  * std_err,
      ci_upr = coef(fit) + q_val  * std_err,
    ) %>%
      mutate(
        stat_sig = if_else(p_val < .05, "*", ""),
        channel_type = str_extract(
          as.character(formula)[2],
          "alternative|extremist|mainstream"
        )
      )
  }
  return(results)
}

QP_time_fit <- map(1:length(time_fs), 
                   ~robust_weighted_quasipoisson(formula = time_fs[[.x]]))

names(QP_time_fit) <-
  c("time_alternative_full",
    "time_extremist_full",
    "time_mainstream_full")

coef_names <- c("Intercept",
                "Racial resentment",
                "Feeling Jews",
                "Hostile sexism",
                "Age",
                "Male",
                "Some college",
                "Bachelor's degree",
                "Post-grad",
                "Non-white")
```



```{r quasipoisson n}
# need to take original model object without robust standard errors first, then separately estimate robust covariance matrix 
QP_time_fit_noSE <- map(time_fs,
                        ~robust_weighted_quasipoisson(formula = .x,
                                                      robust_output = FALSE))
names(QP_time_fit_noSE) <-
  c("time_alternative_full",
    "time_extremist_full",
    "time_mainstream_full")
```

##### dispersion

```{r dispersion}
for (m in 1:length(time_fs)) {
  phi <- # calculate ratio between deviance residuals and residual degrees of freedom
    sum(residuals(QP_time_fit_noSE[[m]], type = "pearson") ^ 2) / QP_time_fit_noSE[[m]]$df.residual
  print(phi)
}
```


##### check multicollinearity

Compute Generalized VIFs ($VIF ^{(1/(2*df))}$) that makes adjustment for categorical variables.

```{r vif}
map(1:length(QP_time_fit_noSE),  ~car::vif(QP_time_fit_noSE[[.x]]))
```



```{r coefficient plot, fig.height = 9}
time_models <- bind_rows(QP_time_fit)

time_models %>%
  filter(predictor != "(Intercept)") %>%
  mutate(predictor = refactor_fxn(recode_fxn(predictor))) %>%
  ggplot(aes(x = estimate, y = str_wrap_factor(predictor, width = 12))) +
  geom_vline(xintercept = 0, lty = 2) +
  geom_linerange(
    aes(xmin = ci_lwr, xmax = ci_upr,
        color = channel_type),
    size = 9,
    show.legend = FALSE
  ) +
  geom_point(
    size = 10,
    shape = 21,
    stroke = 1,
    color = "black",
    fill = "#FFFFFF"
  ) +
  geom_text(aes(label = format(round(estimate, 2), nsmall = 2)),
            size = 3.5, color = "black") +
  geom_text(
    aes(label = stat_sig),
    size = 8,
    nudge_x = .45,
    nudge_y = .25
  ) +
  facet_wrap(~ channel_type,
             labeller = as_labeller(
               c(
                 "alternative" = str_wrap("Minutes/week on alternative channel videos", width = 30),
                 "extremist" = str_wrap("Minutes/week on extremist channel videos", width = 30),
                 "mainstream" = str_wrap("Minutes/week on mainstream media channel videos", width = 30)
               )
             )) +
  scale_color_manual(values = c(
    "alternative" = "#FFA500",
    "extremist" = "#CD5C5C",
    "mainstream" = "#015CB9"
  )) +
  labs(y = "", x = "Quasipoisson coefficient") +
  theme_bw() +
  theme(
    strip.text = element_text(size = 12, hjust = .5),
    strip.background = element_rect(fill = "grey", color = "grey"),
    axis.text.y = element_text(size = 12)
  )

ggsave('./main-text-figures_files/qpois_coefficient_time.png',
      dpi = 600, width = 12, height = 6)
```


```{r fig 4 output, include = FALSE, eval=FALSE}
robust_SEs <- time_models$robust_se
robust_pvals<- time_models$p_val
texreg(QP_time_fit_noSE,
       digits = 2,
       booktabs = TRUE, 
       label = "tab:fig4table",
       caption = "Quasipoisson coefficients for correlates of time per week spent on videos from alternative, extremist, and mainstream media channels. Robust standard errors are in parentheses.",
       # texreg reads models as separate lists
       override.pvalues = list(robust_pvals[1:10],
                    robust_pvals[11:20],
                    robust_pvals[21:30]),
       override.se = list(robust_SEs[1:10],
                    robust_SEs[11:20],
                    robust_SEs[21:30]),
       custom.model.names = c("Alternative",
                                 "Extremist",
                                 "Mainstream"),
       custom.coef.map =
         list(
            "fem_cts" = "Hostile sexism",
            "rr_cts" = "Racial resentment",
            "jw_cts" = "Feeling Jews",
            "age" = "Age",
            "genderMale" =  "Male",
            "raceNon-white" = "Non-white",
            "educ2Some college" = "Some college",
            "educ24-year" = "Bachelor's degree",
            "educ2Post-grad" = "Post-grad",
            "(Intercept)" = "Intercept"
         ),
          file = "main-text-figures_files/figure4.tex")
```


## Figure 4. Hostile sexism as predictor of alternative and extremist channel viewing

```{r calculate predicted values}
vars <- c("rr_tercile", "ft_jew_binned", "gender", "educ2", "race", "pid_lean")
modes <- map(vars, function (i) mode_fxn(i))
names(modes) <- vars

get_predictions <- function (model_name,
                             data,
                             model) {
  # new data fixing all other covariates, varying hostile sexism
  new_data <- 
    expand.grid(
          rr_cts = median(activity_data$rr_cts, na.rm = T),
          jw_cts = median(activity_data$jw_cts, na.rm = T),
          fem_cts = seq(
            min(data$fem_cts, na.rm = T),
            max(data$fem_cts, na.rm = T),
            by = .25
          ),
          week = median(activity_data$week, na.rm = T),
          age = median(activity_data$age, na.rm = T),
          gender = modes$gender,
          educ2 = modes$educ2,
          race = modes$race
    )
  # new data matrix with dummies for categorical variables
  mod_mat <-
    expand.grid(
      intercept = 1,
      rr_cts = median(activity_data$rr_cts, na.rm = T),
      jw_cts = median(activity_data$jw_cts, na.rm = T),
      fem_cts = seq(
        min(activity_data$fem_cts, na.rm = T),
        max(activity_data$fem_cts, na.rm = T),
        by = .25
      ),
      age = median(activity_data$age, na.rm = T),
      genderMale = 1.00000,
      `educ2Some college` = 1.00000,
      `educ24-year` = 0,
      `educ2Post-grad` = 0,
      `raceNon-white` = 0
    )
  # make prediction for each level of hostile sexism and constant other covariate values
  preds <-  predict(model[[model_name]],
                    newdata = new_data,
                    type = "link")
  
  link_inverse <- family(model[[model_name]])$linkinv
  
  crit_val95 <-
    qt(0.025, df = df.residual(model[[model_name]]), lower.tail = FALSE)
  
  # get robust covariance matrix
    var_cov_matrix <-
      vcovHC(model[[model_name]], type = "HC0")
    
  # for each level of hostile sexism and constant other covariates, 
  # calculate with robust covariance matrix
  slices <- map(1:nrow(mod_mat), ~ t(mod_mat[.x, ]))
  robust_se_link <-
    sapply(slices, function (x) {
      sqrt(t(x) %*% var_cov_matrix %*% x)
    })
  
  predicted_df <-
    bind_cols(mod_mat,
              tibble(fit_link = preds,
                     robuse_se_link = robust_se_link)) %>%
    mutate(
      fit_response  = link_inverse(fit_link),
      ci_upr95 = link_inverse(fit_link + (crit_val95 * robuse_se_link)),
      ci_lwr95 = link_inverse(fit_link - (crit_val95 * robuse_se_link)),
      mod = model_name,
      response_var = all.vars(formula(model[[model_name]]))[1],
      alt_ext = str_extract(model_name, "^[a-z]+")
    )
  return(predicted_df)
}

```

```{r predicted value plots}
predicted_data_alternative <- get_predictions(
  model_name = "time_alternative_full",
  data = activity_data,
  model = QP_time_fit_noSE
) %>%
  mutate(channel_type = "alternative")

predicted_data_extremist <- get_predictions(
  model_name = "time_extremist_full",
  data = activity_data,
  model = QP_time_fit_noSE
) %>%
  mutate(channel_type = "extremist")

# join alternative and extremist data
predicted_data <-
  bind_rows(predicted_data_alternative, 
            predicted_data_extremist)

# set y-axis limits
predicted_data <- data.table(predicted_data)
predicted_data[channel_type == "alternative", y_min := 0]
predicted_data[channel_type == "alternative", y_max := 2000]
predicted_data[channel_type == "extremist", y_min := 0]
predicted_data[channel_type == "extremist", y_max := 500]


ggplot(predicted_data, aes(x = fem_cts, y = fit_response)) +
  geom_rug(
    data = activity_data %>%
      filter(!is.na(.data[['fem_cts']])),
    aes(x = .data[['fem_cts']], y = 0),
    color = "darkgrey",
    alpha = .4,
    sides = "b",
    position = position_jitter(width = .2),
    inherit.aes = F
  ) +
  geom_ribbon(aes(fill = channel_type,
                  ymin = ci_lwr95 ,
                  ymax = ci_upr95),
              #show.legend = FALSE,
              alpha = .95) +
  geom_line(size = 2) +
  facet_wrap( ~ channel_type,
              labeller = as_labeller(c(
                "alternative" = str_wrap("Alternative channel videos",
                                         width = 30),
                "extremist" = str_wrap("Extremist channel videos",
                                       width = 30)
              )),
              scales = "free_y") +
  geom_blank(aes(y = y_min)) +
  geom_blank(aes(y = y_max)) +
  scale_fill_manual(
    values = c("#FFA500", "#CD5C5C"),
    name = "",
    labels = c(
      "Alternative channel videos \ny-axis = [0, 2000]",
      "Extremist channel videos \ny-axis = [0, 500]"
    )
  ) +
  labs(x = "Hostile sexism scale",
       y = "Expected minutes per week on channel videos") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 12),
    legend.position = "bottom",
    strip.text = element_blank(),
    strip.background = element_blank()
  )

ggsave('./main-text-figures_files/hostile_sexism_predicted.png',
      dpi = 600, width = 12, height = 6)
```


### Correlates of exposure (pages 15--16)

When hostile sexism is at its minimum value of 1, expected levels are 0.4 minutes per week spent watching alternative channel videos and 0.08 minutes for extremist channel videos. These predicted values increase to 383 and 51 minutes, respectively, when hostile sexism is at its maximum value of 5 (with the greatest marginal increases as hostile sexism reaches its highest levels).

```{r}
predicted_data_alternative %>% 
  filter(fem_cts %in% c(min(fem_cts), max(fem_cts)) ) %>% 
  select(fem_cts, fit_response) %>% 
  rename(`Hostile sexism score` = fem_cts,
         `Predicted value` = fit_response) %>% 
  kbl(digits = 2, 
      format = "html") %>% 
  kable_styling()
```

```{r}
predicted_data_extremist %>% 
  filter(fem_cts %in% c(min(fem_cts), max(fem_cts)) ) %>% 
  select(fem_cts, fit_response)%>% 
  rename(`Hostile sexism score` = fem_cts,
         `Predicted value` = fit_response) %>% 
  kbl(digits = 2, 
      format = "html") %>% 
  kable_styling()
```


### Internal and external referrers (page 20)

49% (52.4% weighted) and 51% (46.6% weighted) of referrers to alternative and extremist channel videos, respectively, were off-platform sources compared to 41% (42%) and 44% (41%), respectively, for videos from mainstream media channels and other channels. 

```{r}
on_platform_referrers_by_channel %>% 
  filter(youtube_video_referrers_by_channel_type == "Off-platform") %>% 
  select(channel_type, youtube_video_referrers_by_channel_type, percentage)%>% 
  kbl(digits = 2, 
      format = "html") %>% 
  kable_styling()
```


...we observe homophily across the video types, with 18% (19.6% weighted) of referrers to alternative videos coming from other alternative video, 14% (21.3% weighted) of referrers to extreme videos coming from other extreme videos, and 26% (25.6% weighted) of referrers to mainstream media videos coming from other mainstream media videos

```{r}
on_platform_referrers_by_channel %>% 
  filter(youtube_video_referrers_by_channel_type == channel_type) %>% 
  select(channel_type, youtube_video_referrers_by_channel_type, percentage)%>% 
  kbl(digits = 2, 
      format = "html") %>% 
  kable_styling()
```


Interestingly, we observe 5% (3.8% weighted) of referrals to extreme videos coming from alternative videos, but only 0.7% (0.8% weighted) of referrals from alternative videos coming from extreme videos 

```{r}
on_platform_referrers_by_channel %>% 
  filter(youtube_video_referrers_by_channel_type %in% c("alternative", "extremist") &
           channel_type %in% c("alternative", "extremist")) %>% 
  select(channel_type, youtube_video_referrers_by_channel_type, percentage)%>% 
  kbl(digits = 2, 
      format = "html") %>% 
  kable_styling()
```


Lastly, we observe that alternative, extreme, and mainstream media videos all receive roughly equal referrals from videos in other channels (13--16%) (10.0--12.8% weighted) and other on-platform sources (16--19%) (15.1--19.1% weighted).

```{r}
on_platform_referrers_by_channel %>% 
  filter(youtube_video_referrers_by_channel_type%in% c("other", "Non-video on-platform")&
           channel_type %in% c("alternative", "extremist", "mainstream") ) %>% 
  select(channel_type, youtube_video_referrers_by_channel_type, percentage)%>% 
  kbl(digits = 2, 
      format = "html") %>% 
  kable_styling()
```



```{r eval=FALSE}
### === these are unweighted numbers
# Bootstrap difference between alternative/extremist off-platform referrers (49% and 51%) vs. other off-platform referrers (44%) (page 16). 95% bootstrapped CIs are in square parentheses.
# 
# (CODE ONLY, see how `youtube_referrers_data` is constructed in `build.R`)
# 
# - alternative v. other: 0.0472 [0.0408, 0.0539]
# - extremist v. other: 0.0702 [0.0550, 0.0865]
# - alternative v. mainstream: 0.0812 [0.0738 , 0.0888]
# - extremist v. mainstream: 0.1043 [0.0873 , 0.1196]

alt_ext_oth <- youtube_referrers_data %>%
  select(channel_type, youtube_video_referrers_by_channel_type)

n_bs_samples <- 1000
gen_bootstrp_samples <- function(channel_type1, channel_type2) {
  ind <- sample(1:nrow(alt_ext_oth), replace = TRUE)
  d <- alt_ext_oth[ind,]
  ct1 <- d[d$channel_type ==channel_type1,]
  ct2 <- d[d$channel_type ==channel_type2,]
  p_ct1 <- prop.table(table(ct1$youtube_video_referrers_by_channel_type))['Off-platform']
  p_ct2 <- prop.table(table(ct2$youtube_video_referrers_by_channel_type))['Off-platform']
  d_p <- p_ct1 - p_ct2
  #print(d_p)
  return(d_p)
}
set.seed(20202021)
alt_other_bs_samples <- replicate(
  n = n_bs_samples,
  expr = gen_bootstrp_samples(channel_type1 = 'alternative',
                              channel_type2 = 'other')
)
ext_other_bs_samples <- replicate(
  n = n_bs_samples,
  expr = gen_bootstrp_samples(channel_type1 = 'extremist',
                              channel_type2 = 'other')
  )
alt_msm_bs_samples <- replicate(
  n = n_bs_samples,
  expr = gen_bootstrp_samples(channel_type1 = 'alternative',
                              channel_type2 = 'mainstream')
)
ext_msm_bs_samples <- replicate(
  n = n_bs_samples,
  expr = gen_bootstrp_samples(channel_type1 = 'extremist',
                              channel_type2 = 'mainstream')
)
bs_CIs <- map(
  list(alt_other_bs_samples, 
       ext_other_bs_samples, 
       alt_msm_bs_samples, 
       ext_msm_bs_samples),
  ~quantile(.x, c(0.025,0.975))
)

```


## Figure 5: Recommendation frequency by type of channel being watched

```{r recommendations shown, fig.show = FALSE, results='hide'}
### ===== recommendations shown setup
# recs_prop_table_fxn() is in helper_fxns.R

#inputs to waffle_plot_fxn
alternative_shown_table <-
  recs_prop_table_fxn(channel_type = "alternative",
                      statistic = "shown")
extremist_shown_table <-
  recs_prop_table_fxn(channel_type = "extremist",
                      statistic = "shown")
mainstream_shown_table <-
  recs_prop_table_fxn(channel_type = "mainstream",
                      statistic = "shown")
other_shown_table <- recs_prop_table_fxn(channel_type = "other",
                                         statistic = "shown")
#outputs to waffle_plot_fxn
alternative_recs_shown <-
  waffle_plot_fxn(alternative_shown_table,
                  statistic = "shown",
                  title = "Alternative channel\nvideos")
extremist_recs_shown <-
  waffle_plot_fxn(extremist_shown_table,
                  statistic = "shown",
                  title = "Extremist channel\nvideos")
mainstream_recs_shown <-
  waffle_plot_fxn(mainstream_shown_table,
                  statistic = "shown",
                  title = "Mainstream media\nchannel videos")
other_recs_shown <-
  waffle_plot_fxn(other_shown_table, statistic = "shown",
                  title = "Other channel\nvideos")

# bar on top of waffles to show scale
total_recs_shown_table <- recs_data %>%
  filter(variable == "total recs shown") %>%
  pivot_longer(-variable) %>%
  select(-variable) %>%
  mutate(
    overall_percent = 100 * (value / sum(value)),
    cumulative_percent = cumsum(overall_percent)
  )

bar_recs_shown_plot <- scale_plot_fxn(data = total_recs_shown_table,
                                      segment_y_upper = 10,
                                      statistic = "shown")

# dummy plot for legend
dummy_plot_shown <- alternative_shown_table %>%
  ggplot(aes(fill = factor(shown),
             values = prop * 100))  +
  geom_waffle(
    n_rows = 10,
    size = .5,
    color = "white",
    make_proportional = TRUE,
    radius = unit(5, "pt")
  ) +
  scale_fill_manual(
    values = c(
      'alternative' = "#FFA500",
      'extremist' = "#CD5C5C",
      'mainstream' = "#015CB9",
      'other' = '#E3E6E6'
    ),
    labels = c(
      'Alternative channel\nvideos' ,
      'Extremist channel\nvideos',
      'Mainstream media\nchannel videos',
      'Other channel\nvideos'
    ),
    name = "Recommendations shown to:"
  ) +
  theme(
    plot.title = ggtext::element_markdown(size = 12, hjust = .5),
    legend.position = 'bottom'
    
  )

# same legend
common_legend_shown <- lemon::g_legend(dummy_plot_shown)

# put waffles together
all_waffles_shown <- gridExtra::arrangeGrob(
  alternative_recs_shown,
  extremist_recs_shown,
  mainstream_recs_shown,
  other_recs_shown,
  nrow = 1
) 
```


Recommendations to alternative and extremist channel videos are rare when watching videos from mainstream media or other types of channels, which together make up 97% (98% weighted) of views in our sample. Recommendations to alternative and extremist channel videos are much more common, however, when people are already viewing videos from alternative and extremist channels, which make up 2.6% (3% weighted) and 0.4% (0.5% weighted) of views, respectively. Just over a third (34.6%) (48% weighted) of recommendations when viewing an alternative channel video point to another alternative channel video, while 25.5% (41% weighted) of recommendations follow the same pattern for extremist channel videos. 

```{r recommedations show plot, fig.height = 8}
# add legend and bar to waffles
recs_shown_grid <- plot_grid(
  plot_grid(
    bar_recs_shown_plot,
    all_waffles_shown,
    rel_heights = c(1.2, 1.5),
    labels = c(
      "A) Percentage of total recommendations shown:",
      "B) Recommendations shown when watching:"
    ),
    label_y = c(.9, 1.1),
    label_size = 18,
    hjust = 0,
    nrow = 2
  ),
  common_legend_shown,
  ncol = 1,
  rel_heights = c(1, .1)
)
recs_shown_grid
ggsave('./main-text-figures_files/waffle_recs_shown_wtd.png',
      dpi = 600, width = 14, height = 8)
ggsave('./main-text-figures_files/waffle_recs_shown_wtd.pdf',
      dpi = 600, width = 14, height = 8)
```


## Figure 6: YouTube recommendations by subscription status and channel type

```{r recs by sub}
# select all subscription variables
sub_data <- activity_data %>% 
  select(ends_with("subscribed"),
         ends_with("unclassified"),
         -contains("adl"),
         user_id,
         samplegroup,
         caseid, weight_cmd) %>%
  # make names consistent
  rename(activity_yt_n_video_total_subscribed = activity_yt_n_video_subscribed,
         activity_yt_n_video_total_notsubscribed = activity_yt_n_video_notsubscribed,
         activity_yt_n_video_total_unclassified = activity_yt_n_video_unclassified,
         activity_yt_rec_n_video_total_subscribed = activity_yt_rec_n_total_subscribed,
         activity_yt_rec_n_video_total_notsubscribed = activity_yt_rec_n_total_notsubscribed,
         activity_yt_rec_n_video_total_unclassified = activity_yt_rec_n_total_unclassified,
         activity_yt_rec_match_n_video_total_subscribed = activity_yt_rec_match_n_video_subscribed,
         activity_yt_rec_match_n_video_total_notsubscribed = activity_yt_rec_match_n_video_notsubscribed,
         activity_yt_rec_match_n_video_total_unclassified = activity_yt_rec_match_n_video_unclassified) 


n_video_substatus <- 
  sub_data %>% 
    mutate(across(starts_with('activity_yt_n_video'), .f = ~weight_cmd*.x)) %>% 
    select(starts_with('activity_yt_n_video')) %>% 
    summarise_all(~sum(.x, na.rm = T)) %>% 
    pivot_longer(everything()) %>% 
    separate(col = name,
             sep = "(?:_[^_]+){3}",
             into = c("metric", "channel_type"),
             remove = TRUE) %>% 
    mutate(metric = 'activity_yt_n_video',
           channel_type = str_extract(channel_type, "[^_]\\w+_\\w+"))%>% 
    separate(col = channel_type,
             sep = "_",
             into = c("channel_type", "status"),
             remove = FALSE) %>% 
    pivot_wider(names_from = status, 
                values_from = value) %>% 
    mutate(total = subscribed + notsubscribed + unclassified) %>% 
    mutate(across(c(subscribed, notsubscribed, unclassified), 
                  .fns = function(.x) (.x/total))) %>% 
    select(metric, channel_type, subscribed, everything()) 

# recommendations shown
rec_n_video_substatus <- 
  sub_data %>% 
  mutate(across(starts_with('activity_yt_rec_n_video'), .f = ~weight_cmd*.x)) %>% 
  select(starts_with('activity_yt_rec_n_video')) %>% 
  summarise_all(~sum(.x, na.rm = T)) %>% 
  pivot_longer(everything()) %>% 
  separate(col = name,
           sep = "(?:_[^_]+){4}",
           into = c("metric", "channel_type"),
           remove = TRUE) %>% 
  mutate(metric = 'activity_yt_rec_n_video',
         channel_type = str_extract(channel_type, "[^_]\\w+_\\w+"))%>% 
  separate(col = channel_type,
           sep = "_",
           into = c("channel_type", "status"),
           remove = FALSE) %>% 
  pivot_wider(names_from = status, 
              values_from = value) %>% 
  mutate(total = subscribed + notsubscribed + unclassified) %>% 
  mutate(across(c(subscribed, notsubscribed, unclassified), 
                .fns = function(.x) (.x/total))) %>% 
  select(metric, channel_type, subscribed, everything()) 

# recommendations followed 
rec_match_n_video_substatus <- 
  sub_data %>% 
  mutate(across(starts_with('activity_yt_rec_match_n_video'), .f = ~weight_cmd*.x)) %>% 
  select(starts_with('activity_yt_rec_match_n_video')) %>% 
  summarise_all(~sum(.x, na.rm = T)) %>% 
  pivot_longer(everything()) %>% 
  separate(col = name,
           sep = "(?:_[^_]+){5}",
           into = c("metric", "channel_type"),
           remove = TRUE) %>% 
  mutate(metric = 'activity_yt_rec_match_n_video',
         channel_type = str_extract(channel_type, "[^_]\\w+_\\w+"))%>% 
  separate(col = channel_type,
           sep = "_",
           into = c("channel_type", "status"),
           remove = FALSE) %>% 
  pivot_wider(names_from = status, 
              values_from = value) %>% 
  mutate(total = subscribed + notsubscribed + unclassified) %>% 
  mutate(across(c(subscribed, notsubscribed, unclassified), 
                .fns = function(.x) (.x/total))) %>% 
  select(metric, channel_type, subscribed, everything()) 

total_visits_table <- recs_data %>%
    filter(variable == "total visits") %>%
    pivot_longer(-variable) %>%
    select(-variable) %>%
    mutate(
        overall_percent = 100 * (value / sum(value)),
        overall_percent = format(round(overall_percent, 2))
    )

combined <-
  rec_n_video_substatus %>%
  pivot_longer(
    cols = c(subscribed, notsubscribed, unclassified),
    names_to = "status",
    values_to = "prop"
  ) %>%
  mutate(
    stderr = sqrt((prop * (1 - prop)) / total),
    ci_lwr95 = prop - qt(.025, df = total, lower.tail = F) *
      stderr,
    ci_upr95 = prop + qt(.025, df = total, lower.tail = F) *
      stderr
  ) %>%
  bind_rows(
    rec_match_n_video_substatus %>%
      pivot_longer(
        cols = c(subscribed, notsubscribed, unclassified),
        names_to = "status",
        values_to = "prop"
      ) %>%
      mutate(
        stderr = sqrt((prop * (1 - prop)) / total),
        ci_lwr95 = prop - qt(.025, df = total, lower.tail = F) *
          stderr,
        ci_upr95 = prop + qt(.025, df = total, lower.tail = F) *
          stderr
      )
  ) %>%
  filter(status == "subscribed" & channel_type != "total") %>%
  mutate(
    metric = factor(
      ifelse(
        metric == "activity_yt_rec_n_video",
        "Recommendations shown",
        "Recommendations followed"
      ),
      levels = c("Recommendations shown",
                 "Recommendations followed")
    ),
    channel_type_label = case_when(
      channel_type == "alternative" ~ paste0("Alternative channel videos\n(",total_visits_table$overall_percent[1],"%)"),
      channel_type == "extremist" ~  paste0("Extremist channel videos\n(",total_visits_table$overall_percent[2],"%)"),
      channel_type == "mainstream" ~  paste0("Mainstream media channel videos\n(",total_visits_table$overall_percent[3],"%)"),
      channel_type == "other" ~  paste0("Other channel videos\n(",total_visits_table$overall_percent[4],"%)")
    ),
    total_label = scales::comma(total)
  ) %>%
  group_by(channel_type) %>%
  mutate(channel_type_sum = sum(total))


# only subscribed bars
combined %>%
  ggplot(aes(x = channel_type_label, 
             y = prop*100,
             fill = channel_type_label,
             pattern = metric)) +
  geom_col_pattern(position = position_dodge2(.9),
                   aes(color = channel_type_label),
                   pattern_color = "#212426",
                   pattern_fill = "#212426",
                   pattern_angle = 45,
                   #pattern_density = 0.06,
                   pattern_spacing = 0.05,
                   pattern_key_scale_factor = 0.2) +
  geom_text(aes(label = total_label, y = -5),
            position = position_dodge2(.9),
            size = 2.5) +
  geom_linerange(aes(ymin = ci_lwr95*100, ymax = ci_upr95*100),
                 position = position_dodge2(.9),
                 lwd = .8) +
  labs(x = "", y = "Percent subscribed to channel") +
  scale_y_continuous(limits = c(-10, 100),
                     breaks = seq(0, 100, by = 20),
                     labels = paste0(seq(0, 100, by = 20),"%") )+
  scale_fill_manual(values = color_palette,
                    name = "",
                    labels = c("Alternative channel\nvideos",
                               "Extremist channel\nvideos",
                               "Mainstream media\nchannel videos",
                               "Other channel\nvideos")) +
  scale_color_manual(values = color_palette,
                    name = "",
                    labels = c("Alternative channel\nvideos",
                               "Extremist channel\nvideos",
                               "Mainstream media\nchannel videos",
                               "Other channel\nvideos")) +
  scale_pattern_manual(name = "",
                       labels = c("Recommendations\nshown",
                                  "Recommendations\nfollowed"),
                       values = c("Recommendations shown" = "stripe",
                                  "Recommendations followed" = "circle")) +
  scale_pattern_density_manual(values = c(.5, .1)) +
  theme_minimal() +
  theme(strip.text = element_text(hjust = 0),
        panel.grid.major.x = element_blank(),
        legend.position = 'bottom')+
  guides(fill = "none",
         pattern = guide_legend(override.aes = list(fill = "white", 
                                                    color = "black")),
         color = "none")

ggsave('./main-text-figures_files/subscriptions_bars_wtd.png',
      dpi = 600,width = 11, height = 5.5)
ggsave('./main-text-figures_files/subscriptions_bars_wtd.pdf',
      dpi = 600,width = 11, height = 5.5)
```



## Figure 7: Relative frequency of referrals to YouTube videos by channel and referrer type

(See how `aggregated_referrers_by_channel` is constructed in `build.R` under "REFERRERS DATA")

```{r fig.height = 10}
# on platform panel
referrers_channel_type_videos_on_platform <-
  aggregated_referrers_by_channel %>%
  filter(on_off_platform == "on-platform" &
           all_referrers_grouped != "Other on-platform") %>%
  mutate(
    on_off_platform = case_when(
      on_off_platform == "on-platform" ~ "On-platform referrer",
      on_off_platform == "off-platform" ~ "Off-platform referrer"
    )
  ) %>%
  ggplot(
    data = .,
    aes(
      y = percentage,
      x = str_wrap_factor(reorder(all_referrers_grouped, order_var), width = 10),
      fill = channel_type,
      group = channel_type_referrer
    )
  ) +
  geom_bar(position = position_dodge(width = .9),
           stat = "identity") +
  geom_linerange(aes(ymin = ci_lwr, ymax = ci_upr),
                 position = position_dodge(width = .9),
                 size = .8) +
  #coord_cartesian(ylim = c(0, 50)) +
  scale_fill_manual(
    values = color_palette,
    labels = c(
      "Alternative channel\nvideos",
      "Extremist channel\nvideos",
      "Mainstream media channel\nvideos",
      "Other channel\nvideos"
    ),
    name = "Domains leading to:"
  ) +
  scale_y_continuous(
    limits = c(0, 60),
    breaks = seq(0, 60, 10),
    labels = paste0(seq(0, 60, 10), "%")
  ) +
  labs(x = "", y = "Estimated percentage from preceding domain") +
  facet_wrap( ~ on_off_platform, nrow = 2) +
  ggthemes::theme_gdocs() +
  theme(
    legend.position = 'bottom',
    legend.direction = 'horizontal',
    strip.text = element_text(
      size = 16,
      hjust = 1,
      color = "black"
    ),
    strip.background = element_rect(fill = "grey", color = "grey"),
    panel.grid.major.y = element_line(color = "grey95"),
    panel.grid.major.x = element_blank(),
    axis.text.x = element_text(size = 12),
    plot.background = element_rect(color = "white")
  )

# off platform panel
referrers_channel_type_videos_off_platform <-
  aggregated_referrers_by_channel %>%
  filter(on_off_platform == "off-platform" &
           all_referrers_grouped != "Other off-platform") %>%
  mutate(
    on_off_platform = case_when(
      on_off_platform == "on-platform" ~ "On-platform referrer",
      on_off_platform == "off-platform" ~ "Off-platform referrer"
    ),
    all_referrers_grouped = case_when(
      all_referrers_grouped == "alternative social" ~ "Alternative social",
      all_referrers_grouped == "webmail" ~ "Webmail",
      all_referrers_grouped == "search engine" ~ "Search engine",
      all_referrers_grouped == "main social" ~ "Mainstream social",
      all_referrers_grouped == "other off-platform" ~ "Other off-platform"
    )
  ) %>%
  ggplot(
    data = .,
    aes(
      y = percentage,
      x = str_wrap_factor(reorder(all_referrers_grouped, order_var), width = 10),
      fill = channel_type,
      group = channel_type_referrer
    )
  ) +
  geom_bar(position = position_dodge(width = .9),
           stat = "identity") +
  geom_linerange(aes(ymin = ci_lwr, ymax = ci_upr),
                 position = position_dodge(width = .9),
                 size = .8) +
  #coord_cartesian(ylim = c(0, 50)) +
  scale_y_continuous(
    limits = c(0, 60),
    breaks = seq(0, 60, 10),
    labels = paste0(seq(0, 60, 10), "%")
  ) +
  scale_fill_manual(
    values = color_palette,
    labels = c(
      "Alternative channel\nvideos",
      "Extremist channel\nvideos",
      "Mainstream media channel\nvideos",
      "Other channel\nvideos"
    ),
    name = "Domains leading to:"
  ) +
  labs(x = "", y = "Estimated percentage from preceding domain") +
  facet_wrap( ~ on_off_platform, nrow = 2) +
  ggthemes::theme_gdocs() +
  theme(
    legend.position = 'bottom',
    legend.direction = 'horizontal',
    strip.text = element_text(
      size = 16,
      hjust = 1,
      color = "black"
    ),
    strip.background = element_rect(fill = "grey", color = "grey"),
    panel.grid.major.y = element_line(color = "grey95"),
    panel.grid.major.x = element_blank(),
    axis.text.x = element_text(size = 12),
    plot.background = element_rect(color = "white")
  )

common_legend_channel_type <- get_legend(referrers_channel_type_videos_off_platform)

plot_grid(
  plot_grid(
    referrers_channel_type_videos_on_platform +
      theme(legend.position = 'none'),
    referrers_channel_type_videos_off_platform +
      theme(legend.position = 'none'),
    labels = c("A", "B"),
    label_size = 30,
    nrow = 2
  ),
  common_legend_channel_type,
  ncol = 1,
  rel_heights = c(1, .1)
)

ggsave('./main-text-figures_files/referrers_channel_type_videos_wtd.png',
      dpi = 600, width = 12, height = 12)
ggsave('./main-text-figures_files/referrers_channel_type_videos_wtd.pdf',
      dpi = 600, width = 12, height = 12)
```


### Survey measures of racial resentment and hostile sexism stats (page 10)

Racial resentment and hostile sexism measures were also included in our 2020 survey; responses showed a high degree of persistence over time ($r=.92$ for racial resentment, $r =.79$ for hostile sexism).

```{r persist rr}
#persistence racial resentment
cor(merged_data$rr_blk_mean, merged_data$rr_cts,
    use = "complete.obs",
    method = 'pearson')
```

```{r persist hostile sexism}
# hostile sexism
cor(merged_data$fem_mean, merged_data$fem_cts,
    use = "complete.obs",
    method = 'pearson')
```



### what % of alt and ext channel video views are from the high gender/racial resentment oversample? what % from high YT oversample?

n_views_alternative_RRgroup / total_views_alternative

1. total_views_alternative = sum(n_video_views_alternative)
2. n_views_alternative_RRgroup = sum(n_video_views_alternative | subgroup = RR)

```{r}
#samplegroup = 2 is high RR
oversample_analysis <- merged_data %>%
  select(user_id, samplegroup, browser_sample, weight_cmd,
         rr_2018 = rr_cts, 
         rr_2020 = rr_blk_mean,
         gr_2018 = fem_cts,
         gr_2020 = fem_mean,
         activity_yt_n_video_alternative_all,
         activity_yt_n_video_extremist_all,
         activity_yt_video_time_elapsed_capped_total_alternative_all, # in seconds
         activity_yt_video_time_elapsed_capped_total_extremist_all)  %>%
  mutate(samplegroup = case_when(
    samplegroup ==1 ~ "CCES 2018 recontact",
    samplegroup ==2 ~ "CCES 2018 with high racial resentment recontact",
    samplegroup ==3 ~ "High YouTube users"
  ))
  
browser_oversample_analysis <- oversample_analysis %>%
  filter(browser_sample == 'browser')

dvs <- c(
  "activity_yt_n_video_alternative_all",
  "activity_yt_n_video_extremist_all",
  "activity_yt_video_time_elapsed_capped_total_alternative_all",
  "activity_yt_video_time_elapsed_capped_total_extremist_all"
)

raw_numbers <- map_dfr(
  dvs,
  ~ browser_oversample_analysis  %>%
    group_by(samplegroup)  %>%
    summarise(raw_subgroup_n = sum(.data[[.x]], na.rm = T)) %>%
    ungroup()  %>%
    mutate(raw_total_n = sum(raw_subgroup_n),
           raw_prop_n = raw_subgroup_n / raw_total_n,
           stat = .x)
);raw_numbers


weighted_numbers <- map_dfr(
  dvs,
  ~ browser_oversample_analysis %>%
    mutate(wtd_stat = .data[[.x]] * weight_cmd) %>%
    group_by(samplegroup) %>%
    summarise(wtd_subgroup_n = sum(wtd_stat, na.rm = T))  %>%
    ungroup()  %>%
    mutate(wtd_total_n = sum(wtd_subgroup_n),
           wtd_prop_n = wtd_subgroup_n / wtd_total_n,
           stat = .x)
);weighted_numbers

visit_time_subgroup <- bind_cols(raw_numbers,
                    weighted_numbers  %>% select(-samplegroup,-stat))  %>%
  select(samplegroup, stat, everything())  %>% 
  mutate(
        channel_type = case_when(
    str_detect(stat, "alternative") ~ "Alternative", 
    str_detect(stat, "extremist") ~ "Extremist"
    ),
    stat = case_when(
      stat == 'activity_yt_n_video_alternative_all' ~ "visit",
            stat == 'activity_yt_n_video_extremist_all' ~ "visit",
            stat == 'activity_yt_video_time_elapsed_capped_total_alternative_all' ~ "time (secs)",
            stat == 'activity_yt_video_time_elapsed_capped_total_extremist_all' ~ "time (secs)"
      
    )
  ) %>%
  rename(`Unwweighted prop` = raw_prop_n,
         `Weighted prop` = wtd_prop_n,
         `Unwweighted n` = raw_subgroup_n,
         `Unwweighted total` = raw_total_n,
         `Weighted n` = wtd_subgroup_n,
         `Weighted total` = wtd_total_n) %>%
  select(samplegroup, channel_type, stat,
         everything())


visit_time_subgroup  %>%
  kbl(digits = 2, 
      booktabs = T,
      format = "latex") %>% 
  kable_styling() %>% 
  save_kable("main-text-figures_files/exposure_by_subgroup.tex",float = FALSE)
```

